home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/python -t
-
- # This library is free software; you can redistribute it and/or
- # modify it under the terms of the GNU Lesser General Public
- # License as published by the Free Software Foundation; either
- # version 2.1 of the License, or (at your option) any later version.
- #
- # This library is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- # Lesser General Public License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public
- # License along with this library; if not, write to the
- # Free Software Foundation, Inc.,
- # 59 Temple Place, Suite 330,
- # Boston, MA 02111-1307 USA
-
- # This file is part of urlgrabber, a high-level cross-protocol url-grabber
- # Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko
-
- """NAME
- urlgrabber - a simple client for the urlgrabber python package
-
- DESCRIPTION
- This is a thin client for the urlgrabber python package. It is
- provided mainly for helping debug the python package. It provides
- low-level access to most urlgrabber features from the shell.
-
- There are two types of options available for this program. They are
- 'client options' and 'module options'. Client options apply
- specifically to the behavior of this client, whereas module options
- are built-in options to the urlgrabber module. Both of these are
- available from the client command line, but they're documented a
- little differently. Client options are documented here, and module
- options are documented through the '--help <option>' syntax.
-
- CLIENT OPTIONS
- -h, --help
-
- In its short form (-h), this provides a short usage description.
- In its long form, it processes the remaining command line as help
- topics. Legal topics are:
- doc this info
- options a list of module options
- <option(s)> documentation for a given module option
- all documentation for all module options
-
- Examples:
- urlgrabber --help options
- urlgrabber --help copy_local
-
- -o FILE
-
- By default, downloaded data will be written to a file named using
- the basename of the url. For example,
- 'http://foo.com/index.html' will be written to 'index.html'. You
- can override this for your convenience or when necessary for urls
- like 'http://foo.com/'
-
- -O
-
- Print the local name of each downloaded file to STDOUT. This is
- helpful when not using the '-o' option, but is particularly
- useful when using --copy_local=0 (the default) on local files
- because local files will not be copied and the output filename
- will not be the same as that provided with '-o'.
-
- --repeat=N
-
- Grab each url N times. This is mostly for debugging keepalive.
-
- -p, --progress
-
- Use the default text-based progress meter.
-
- -v, --verbose=N
-
- Increment the verbosity level with each use of '-v' or set it
- directly with --verbose=N. Currently, distinct levels are 0-2.
- The default is 0.
-
- -d SPEC, --debug=SPEC
-
- Turn on internal urlgrabber debugging. This is equivalent to (but
- overrides) running with the environment variable:
- URLGRABBER_DEBUG=SPEC
- SPEC can be of the form LEVEL,FILENAME, where
- LEVEL can be string (DEBUG, WARN, etc) or number.
- FILENAME can be the name of a file or "-" for STDOUT. The
- default is STDERR. Example: -d1,- logs everything to STDOUT.
- Note: this only works for python >= 2.3 because it requires the
- logging package.
-
- -D
-
- A convenience alias for: --verbose=3 --progress --debug=INFO,-
-
- --profile
-
- Profile the actual fetching and print the results.
-
- """
-
- MAINHELP = """usage: urlgrabber [options] <url>
- urlgrabber - a simple client for the urlgrabber python package
-
- options:
- -h, --help print this message
- --help doc print basic intro and documentation
- --help options list available options to the grabber module
- --help <option> print documentation for a module option
- --help all print documentation for all module options
- --<option>=VAL specify a module option. VAL must be a python value,
- including quotes in the case of strings.
- e.g. --user_agent='"foobar/2.0"'
-
- -o FILE write output to FILE, otherwise the basename of the
- url will be used
- -O print the names of saved files to STDOUT
- --repeat=N grab each URL N times (mostly for debugging keepalive)
- -p, --progress use the default text progress meter
- -v increment the verbosity level (defaults to 0)
- --verbose=N set the verbosity level to N
- -d SPEC, --debug=SPEC
- turn on urlgrabber module debugging with
- SPEC=LEVEL,FILENAME. e.g. -d 1,debug.txt
- -D a convenience option equivalent to:
- --verbose=3 --progress --debug=INFO,-
- --profile profile the actual fetching and print the results
- """
-
- # $Id: urlgrabber,v 1.7 2006/12/08 00:14:16 mstenner Exp $
-
- import sys
- import getopt
- import re
-
- import urlgrabber.grabber
- from urlgrabber.grabber import URLGrabber, URLGrabberOptions, URLGrabError
-
class client_options:
    """Command-line state for the urlgrabber client.

    Instantiating the class discovers the available module options and then
    parses ``sys.argv``.  After construction the following attributes are set:

    ug_options   list of module option names accepted as ``--<name>=VAL``
    ug_defaults  dict mapping each module option name to the repr() of its
                 default value
    verbose      int verbosity level
    debug        debug SPEC string, or None
    outputfile   value given to ``-o``, or None
    localfile    1 if ``-O`` was given, else 0
    repeat       int number of times to grab each url (>= 1)
    progress     1 if the text progress meter was requested, else 0
    profile      1 if ``--profile`` was given, else 0
    ugops        dict of module options given on the command line
    args         positional arguments (the urls, or the help topics)
    """

    # Help topics served by a zero-argument ``help_<topic>`` method.  Kept
    # explicit so that a topic such as "ug_option" cannot be dispatched to
    # help_ug_option() without its required argument.
    _help_topics = ('doc', 'options', 'all')

    def __init__(self):
        self.ug_options, self.ug_defaults = self.get_ug_options()
        self.process_command_line()

    def get_ug_options(self):
        """Return ``(ug_options, ug_defaults)`` for the grabber module.

        The list starts with a fixed, ordered set of option names and is
        extended with every other instance attribute found on a fresh
        URLGrabberOptions object, except those in the exclusion list.  Names
        that cannot be read from the object are dropped.  The defaults dict
        maps each remaining name to the repr() of its value.
        """
        ugo = URLGrabberOptions()
        ug_options = ['copy_local', 'keepalive', 'prefix', 'reget',
                      'data', 'quote', 'throttle', 'bandwidth',
                      'proxies', 'retry', 'retrycodes',
                      'range', 'user_agent',
                      'http_headers', 'ftp_headers',
                      'ssl_ca_cert', 'ssl_context',
                      'text', 'close_connection',
                      'cache_openers', 'timeout']
        options_exclude = ['delegate', 'interrupt_callback',
                           'failure_callback', 'urlparser', 'opener',
                           'checkfunc', 'progress_obj']
        for k in ugo.__dict__.keys():
            if (k not in ug_options) and (k not in options_exclude):
                ug_options.append(k)
        ug_defaults = {}
        # Iterate over a copy because unreadable names are removed in place.
        for k in list(ug_options):
            try:
                ug_defaults[k] = repr(getattr(ugo, k))
            except AttributeError:
                ug_options.remove(k)
        return ug_options, ug_defaults

    def _int_option(self, value, minimum, message):
        """Return *value* as an int >= *minimum*, or print *message* and exit(1)."""
        try:
            number = int(value)
            if number < minimum:
                raise ValueError()
        except ValueError:
            print(message)
            sys.exit(1)
        return number

    def process_command_line(self):
        """Parse ``sys.argv[1:]`` and store the result on the instance.

        Exits with status 1 on an unrecognised option, a malformed
        ``--verbose``/``--repeat`` value, a module option value that cannot be
        evaluated, or ``-o`` combined with more than one url.  Shows help (and
        exits) when no positional argument is given or ``-h``/``--help`` is
        present.
        """
        # 'o:' -- the -o option takes the output FILE as its argument.
        short_options = 'vd:ho:OpD'
        long_options = ['profile', 'repeat=', 'verbose=',
                        'debug=', 'help', 'progress']
        ug_long = [o + '=' for o in self.ug_options]
        try:
            optlist, args = getopt.getopt(sys.argv[1:], short_options,
                                          long_options + ug_long)
        except getopt.GetoptError as e:
            print('ERROR: %s' % e)
            sys.exit(1)

        self.verbose = 0
        self.debug = None
        self.outputfile = None
        self.localfile = 0
        self.repeat = 1
        self.progress = 0
        self.profile = 0
        self.ugops = {}
        self.args = args

        ug_dash = ['--' + o for o in self.ug_options]
        if not args:
            self.help(args)
        for (o, v) in optlist:
            if o in ('-h', '--help'):
                self.help(args)
            elif o == '--verbose':
                # Keep the level numeric so '-v' and later comparisons work.
                self.verbose = self._int_option(
                    v, 0, 'ERROR: verbose value must be an int >= 0')
            elif o == '-v':
                self.verbose += 1
            elif o == '-o':
                self.outputfile = v
            elif o in ('-p', '--progress'):
                self.progress = 1
            elif o in ('-d', '--debug'):
                self.debug = v
            elif o == '--profile':
                self.profile = 1
            elif o == '-O':
                self.localfile = 1
            elif o == '--repeat':
                self.repeat = self._int_option(
                    v, 1, 'ERROR: repeat value must be an int >= 1')
            elif o == '-D':
                self.verbose = 3
                self.debug = "INFO,-"
                self.progress = 1
            elif o in ug_dash:
                # NOTE(security): module option values are evaluated as Python
                # expressions (MAINHELP tells the user VAL must be a python
                # value).  That is tolerable only because the text comes from
                # the invoking user's own command line; never route untrusted
                # input through this branch.
                try:
                    val = eval(v)
                except Exception as e:
                    print("error processing option value: %s" % v)
                    print(e)
                    sys.exit(1)
                else:
                    self.ugops[o[2:]] = val

        if len(self.args) > 1 and self.outputfile is not None:
            print("ERROR: cannot use -o when grabbing multiple files")
            sys.exit(1)

    def help(self, args):
        """Print help for each topic in *args* (or the usage text) and exit(0).

        A topic is one of 'doc', 'options', 'all', or the name of a module
        option; anything else produces an error line.
        """
        if not args:
            print(MAINHELP)
        else:
            for a in args:
                if a in self._help_topics:
                    getattr(self, 'help_' + a)()
                elif a in self.ug_options:
                    self.help_ug_option(a)
                else:
                    print('ERROR: no help on command "%s"' % a)
        sys.exit(0)

    def help_doc(self):
        """Print this script's module docstring."""
        print(__doc__)

    def help_options(self):
        """Print a table of module option names and their default values."""
        width = max(map(len, self.ug_options))
        row_format = ' %-' + str(width) + 's = %s'
        header_format = ' %-' + str(width) + 's %s'
        print(header_format % ('OPTION', 'DEFAULT'))
        print('-' * (width + 20))
        for k in self.ug_options:
            print(row_format % (k, self.ug_defaults[k]))

    def help_all(self):
        """Print the documentation of every module option."""
        for k in self.ug_options:
            self.help_ug_option(k)

    def help_ug_option(self, option):
        """Print the section of the grabber module docstring about *option*.

        The section is located with a regular expression over
        ``urlgrabber.grabber.__doc__``; a fallback line is printed when no
        match is found.  A blank line follows in either case.
        """
        # The docstring is None when Python strips docstrings (-OO).
        doc = urlgrabber.grabber.__doc__ or ''
        m = re.search(r'^( ' + re.escape(option) + r'.*?)\s*^ {,2}\S',
                      doc, re.M | re.S)
        if m:
            print(m.group(1))
        else:
            print(' %s: no help found for this option' % option)
        print('')
-
class ugclient:
    """The urlgrabber command-line client.

    Construction parses the command line (via client_options), optionally
    installs a debug logger, and builds the URLGrabber stored in ``self.g``.
    The parsed options are kept in ``self.op``.
    """

    def __init__(self):
        op = client_options()
        self.op = op
        if op.verbose >= 2 and op.ugops:
            print("Module Options:")
            width = max(map(len, op.ugops.keys()))
            row_format = " %-" + str(width) + "s = %s"
            for k, v in op.ugops.items():
                print(row_format % (k, repr(v)))

        if op.debug:
            self.set_debug_logger(op.debug)
            # Only call the state-logging hook if this grabber module has it.
            if hasattr(urlgrabber.grabber, '_log_package_state'):
                urlgrabber.grabber._log_package_state()

        kwargs = dict(op.ugops)
        if op.progress:
            from urlgrabber.progress import text_progress_meter
            kwargs['progress_obj'] = text_progress_meter()

        self.g = URLGrabber(**kwargs)

    def run(self):
        """Grab every url from the command line, ``repeat`` times each.

        A URLGrabError is printed and the next url is tried.  With ``-O`` the
        name returned by the last grab of each url is printed.
        """
        for url in self.op.args:
            if self.op.verbose:
                print('grabbing: %s' % url)
            try:
                for _ in range(self.op.repeat):
                    f = self.g.urlgrab(url, self.op.outputfile)
                if self.op.localfile:
                    print(f)
            except URLGrabError as e:
                print(e)

    def _make_debug_logger(self, dbspec):
        """Return the configured 'urlgrabber' logger for *dbspec*, or None.

        *dbspec* has the form ``LEVEL[,FILENAME]``.  LEVEL is a logging level
        name or a positive integer.  FILENAME is a path, ``-`` for stdout, or
        empty/absent for stderr.  None is returned when the logging package is
        unavailable or LEVEL is not a valid level >= 1.
        """
        try:
            dbinfo = dbspec.split(',')
            import logging
            # Public lookup: yields the numeric level for a known name and a
            # "Level ..." string for anything else.
            level = logging.getLevelName(dbinfo[0])
            if not isinstance(level, int):
                level = int(dbinfo[0])
            if level < 1:
                raise ValueError()

            formatter = logging.Formatter('%(asctime)s %(message)s')
            if len(dbinfo) > 1:
                filename = dbinfo[1]
            else:
                filename = ''
            if filename == '':
                handler = logging.StreamHandler(sys.stderr)
            elif filename == '-':
                handler = logging.StreamHandler(sys.stdout)
            else:
                handler = logging.FileHandler(filename)
            handler.setFormatter(formatter)
            logger = logging.getLogger('urlgrabber')
            logger.addHandler(handler)
            logger.setLevel(level)
        except (KeyError, ImportError, ValueError):
            logger = None
        return logger

    def set_debug_logger(self, dbspec):
        """Install the logger described by *dbspec* into the grabber module.

        An invalid spec results in ``set_logger(None)`` being called.
        """
        urlgrabber.grabber.set_logger(self._make_debug_logger(dbspec))
-
if __name__ == '__main__':
    # Script entry point: build the client (which parses the command line),
    # then fetch either directly or under the profiler.
    ugc = ugclient()
    if not ugc.op.profile:
        ugc.run()
    else:
        import profile
        import pstats
        profiler = profile.Profile()
        # The command string refers to the module-level name ``ugc``.
        profiler.run('ugc.run()')
        stats = pstats.Stats(profiler)
        stats.strip_dirs()
        stats.sort_stats('cumulative')
        stats.print_stats()
-